import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the HDB resale dataset and keep only the columns used in this analysis.
hdb = pd.read_csv('HDB_Complete_dataset.csv')
dataset_features = hdb[['resale_price', 'town_cat','storey_range','flat_type' ,'floor_area_sqm', 'Planning Area','total_dwelling_units', 'remaining_lease_yrs',
                        'distance_secondary_school','distance_primary_school', 'distance_mrt', 'distance_supermarket', 'distance_hawker',
                        'distance_city', 'distance_npc', 'distance_cc','commercial','Mature_Estate']]
print(len(dataset_features))

# Dependent variable (y): the resale price.
resale_p = dataset_features['resale_price']

# Independent variables (X) — note the column order differs from
# dataset_features (flat_type first).
X = dataset_features[['flat_type' , 'town_cat','storey_range','floor_area_sqm', 'Planning Area','total_dwelling_units', 'remaining_lease_yrs',
                      'distance_secondary_school','distance_primary_school', 'distance_mrt', 'distance_supermarket', 'distance_hawker',
                      'distance_city', 'distance_npc', 'distance_cc','commercial','Mature_Estate']]

dataset_features.head()
111824
| resale_price | town_cat | storey_range | flat_type | floor_area_sqm | Planning Area | total_dwelling_units | remaining_lease_yrs | distance_secondary_school | distance_primary_school | distance_mrt | distance_supermarket | distance_hawker | distance_city | distance_npc | distance_cc | commercial | Mature_Estate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 232000.0 | 0.0 | 11.0 | 2.0 | 44.0 | 2.0 | 220 | 61.679452 | 0.576105 | 0.089405 | 1.000219 | 0.748675 | 0.172411 | 7.612266 | 0.828923 | 0.27193 | 0 | 1.0 |
| 1 | 235000.0 | 0.0 | 11.0 | 2.0 | 44.0 | 2.0 | 220 | 61.084932 | 0.576105 | 0.089405 | 1.000219 | 0.748675 | 0.172411 | 7.612266 | 0.828923 | 0.27193 | 0 | 1.0 |
| 2 | 202000.0 | 0.0 | 2.0 | 2.0 | 44.0 | 2.0 | 220 | 60.339726 | 0.576105 | 0.089405 | 1.000219 | 0.748675 | 0.172411 | 7.612266 | 0.828923 | 0.27193 | 0 | 1.0 |
| 3 | 210000.0 | 0.0 | 2.0 | 2.0 | 44.0 | 2.0 | 220 | 60.339726 | 0.576105 | 0.089405 | 1.000219 | 0.748675 | 0.172411 | 7.612266 | 0.828923 | 0.27193 | 0 | 1.0 |
| 4 | 220000.0 | 0.0 | 6.0 | 2.0 | 44.0 | 2.0 | 220 | 60.084932 | 0.576105 | 0.089405 | 1.000219 | 0.748675 | 0.172411 | 7.612266 | 0.828923 | 0.27193 | 0 | 1.0 |
# Correlation analysis: resale price against every retained variable.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

corrMatrix = dataset_features.corr()
fig, ax = plt.subplots(figsize=(12, 9))
# Annotated heatmap of the pairwise Pearson correlations.
sns.heatmap(corrMatrix,
            xticklabels=corrMatrix.columns,
            yticklabels=corrMatrix.columns,
            cmap='coolwarm',
            annot=True,
            ax=ax)
<AxesSubplot:>
# Pairwise scatter plots of all retained features (KDE on the diagonal).
import seaborn as sns
sns.pairplot(dataset_features,diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x7f99d39f7340>
# Benchmark three regressors on a 70/30 train/test split of the HDB data.
from sklearn.linear_model import LinearRegression
from sklearn import ensemble
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler

models_name = ['LinearRegression', 'GBRT', 'RandomForest']
models = [
    LinearRegression(),
    ensemble.GradientBoostingRegressor(n_estimators=100),
    ensemble.RandomForestRegressor(n_estimators=200),
]

np.random.seed(100)
X_train, X_test, y_train, y_test = train_test_split(X, resale_p, test_size=.3, random_state=0)

# Fit each model and collect its test-set mean squared error.
MSE_lst = []
for model in models:
    model.fit(X_train, y_train)           # train on the training split
    y_pred = model.predict(X_test)        # predict on the held-out split
    MSE = mean_squared_error(y_test, y_pred)
    MSE_lst.append(MSE)
# Ordinary least squares baseline, scored on the test split.
np.random.seed(100)
regre = LinearRegression().fit(X_train, y_train)
y_pred_ols = regre.predict(X_test)
MSE_ols = mean_squared_error(y_test, y_pred_ols)
MSE_lst.append(MSE_ols)
# R^2 on the test set (equivalent to regre.score(X_test, y_test)).
score_regre = r2_score(y_test, y_pred_ols)
print('OLS_MSE =', MSE_ols)
print('OLS_R squared =', score_regre)
# Gradient-boosting benchmark, scored on the test split.
from sklearn.ensemble import GradientBoostingRegressor

# FIX: the original rebound the name `GradientBoostingRegressor` to the fitted
# instance, shadowing the class. Use a distinct variable for the instance.
gbr = GradientBoostingRegressor()
gbr.fit(X=X_train, y=y_train)
y_pred_GradientBoostingRegressor = gbr.predict(X=X_test)
MSE_GradientBoostingRegressor = mean_squared_error(y_test, y_pred_GradientBoostingRegressor)
MSE_lst.append(MSE_GradientBoostingRegressor)
score_GBR = gbr.score(X_test,y_test)  # R^2 on the test set
print('GBR_MSE =', MSE_GradientBoostingRegressor)
print('GBR_R squared =', score_GBR)
# Random-forest benchmark, scored on the test split.
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): this rebinds `RandomForestRegressor` from the class to the
# fitted instance; later cells (the residual-plot section) use this instance
# under that name, so the shadowing is kept as-is here.
RandomForestRegressor = RandomForestRegressor()
RandomForestRegressor.fit(X=X_train, y=y_train)
y_pred_RandomForestRegressor = RandomForestRegressor.predict(X=X_test)
MSE_RandomForestRegressor = mean_squared_error(y_test, y_pred_RandomForestRegressor)
MSE_lst.append(MSE_RandomForestRegressor)
score_RFR = RandomForestRegressor.score(X_test,y_test)
print('RFR_MSE =',MSE_RandomForestRegressor)
print('RFR_R squared =', score_RFR)
OLS_MSE = 4387516252.337464 OLS_R squared = 0.8252954926006144 GBR_MSE = 2378680647.040532 GBR_R squared = 0.9052844008315015 RFR_MSE = 896194756.3756748 RFR_R squared = 0.9643148299762737
# Side-by-side table of actual vs OLS-predicted resale prices.
OLS_df = pd.DataFrame(data= { 'resale_price': y_test,'predicted_price': y_pred_ols})
OLS_df.head()
| resale_price | predicted_price | |
|---|---|---|
| 32738 | 390000.0 | 451258.310654 |
| 73042 | 491000.0 | 595666.431557 |
| 82527 | 250000.0 | 299178.037057 |
| 6719 | 300000.0 | 258900.057618 |
| 52230 | 255000.0 | 239525.766321 |
OLS_df.to_csv('OLS_HDB.csv',index=False)
#Summary of OLS model — statsmodels gives coefficient-level statistics
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
# statsmodels OLS fits no intercept by default, so add a constant column.
X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: resale_price R-squared: 0.826
Model: OLS Adj. R-squared: 0.826
Method: Least Squares F-statistic: 2.181e+04
Date: Sat, 20 Nov 2021 Prob (F-statistic): 0.00
Time: 19:55:50 Log-Likelihood: -9.7962e+05
No. Observations: 78276 AIC: 1.959e+06
Df Residuals: 78258 BIC: 1.959e+06
Df Model: 17
Covariance Type: nonrobust
=============================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------
const -3.231e+05 2692.377 -120.019 0.000 -3.28e+05 -3.18e+05
flat_type 2.229e+04 864.990 25.770 0.000 2.06e+04 2.4e+04
town_cat -149.9141 34.632 -4.329 0.000 -217.792 -82.036
storey_range 4475.4955 44.435 100.720 0.000 4388.403 4562.588
floor_area_sqm 3802.5956 33.416 113.796 0.000 3737.101 3868.090
Planning Area 1.181e+04 231.766 50.978 0.000 1.14e+04 1.23e+04
total_dwelling_units 45.9253 4.705 9.761 0.000 36.703 55.147
remaining_lease_yrs 5391.6492 24.170 223.068 0.000 5344.275 5439.023
distance_secondary_school 1.045e+04 863.666 12.101 0.000 8758.246 1.21e+04
distance_primary_school 2.137e+04 1086.030 19.675 0.000 1.92e+04 2.35e+04
distance_mrt -4.593e+04 685.049 -67.046 0.000 -4.73e+04 -4.46e+04
distance_supermarket -5477.6949 1221.483 -4.484 0.000 -7871.794 -3083.596
distance_hawker -2.145e+04 214.965 -99.791 0.000 -2.19e+04 -2.1e+04
distance_city -9677.6939 89.837 -107.725 0.000 -9853.773 -9501.614
distance_npc -1.166e+04 456.991 -25.521 0.000 -1.26e+04 -1.08e+04
distance_cc -6502.2467 716.877 -9.070 0.000 -7907.322 -5097.172
commercial -5189.0543 640.026 -8.108 0.000 -6443.502 -3934.606
Mature_Estate 5.068e+04 783.528 64.681 0.000 4.91e+04 5.22e+04
==============================================================================
Omnibus: 9269.844 Durbin-Watson: 1.996
Prob(Omnibus): 0.000 Jarque-Bera (JB): 18158.262
Skew: 0.764 Prob(JB): 0.00
Kurtosis: 4.798 Cond. No. 2.10e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.1e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
#Residual for Linear regression model
regre.fit(X_train,y_train)  # refit (already fitted above; harmless repeat)
y_train_pred=regre.predict(X_train)
y_test_pred=regre.predict(X_test)
#Evaluate model performance via MSE and R2_Score
from sklearn.metrics import mean_squared_error,r2_score
print("MSE Train: %.3f, Test: %.3f" % (mean_squared_error(y_train,y_train_pred),
                                       mean_squared_error(y_test,y_test_pred)))
print("R2_Score Train: %.3f, Test: %.3f" % (r2_score(y_train,y_train_pred),
                                            r2_score(y_test,y_test_pred)))
#Visualize the residuals of the prediction (residual = prediction - actual)
plt.scatter(y_train_pred,y_train_pred-y_train,
            c='steelblue',
            edgecolor='white',
            marker='o',
            s=35,
            alpha=0.9,
            label='Training Data')
plt.scatter(y_test_pred,y_test_pred-y_test,
            c='limegreen',
            edgecolor='white',
            marker='s',
            s=30,
            alpha=0.9,
            label='Test Data')
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Linear Regression Residual plot for HDB")
plt.legend(loc='upper left')
# Zero-residual reference line across the plotted price range.
plt.hlines(y=0,xmin=0,xmax=1200000,lw=2,color='black')
plt.xlim([0,1200000])
plt.show()
MSE Train: 4343664017.745, Test: 4387516252.337 R2_Score Train: 0.826, Test: 0.825
# Overlay actual vs predicted prices (first 100 test rows), one subplot per model.
import matplotlib.pyplot as plt

plt.figure(figsize=(8,18))
for idx, (name, model) in enumerate(zip(models_name, models)):
    ax = plt.subplot(311 + idx)
    y_pred = model.predict(X_test)
    ax.plot(range(len(y_test[:100])), y_test[:100], 'k--', label='Actual')
    ax.plot(range(len(y_pred[:100])), y_pred[:100], 'k', label='Predicted')
    plt.title('%s' % name)
#Residual for Random Forest before parameter adjustment
# (RandomForestRegressor here is the fitted instance from the benchmark cell)
RandomForestRegressor.fit(X_train,y_train)
y_train_pred=RandomForestRegressor.predict(X_train)
y_test_pred=RandomForestRegressor.predict(X_test)
#Evaluate model performance via MSE and R2_Score
from sklearn.metrics import mean_squared_error,r2_score
print("MSE Train: %.3f, Test: %.3f" % (mean_squared_error(y_train,y_train_pred),
                                       mean_squared_error(y_test,y_test_pred)))
print("R2_Score Train: %.3f, Test: %.3f" % (r2_score(y_train,y_train_pred),
                                            r2_score(y_test,y_test_pred)))
#Visualize the residuals of the prediction (residual = prediction - actual)
plt.scatter(y_train_pred,y_train_pred-y_train,
            c='steelblue',
            edgecolor='white',
            marker='o',
            s=35,
            alpha=0.9,
            label='Training Data')
plt.scatter(y_test_pred,y_test_pred-y_test,
            c='limegreen',
            edgecolor='white',
            marker='s',
            s=30,
            alpha=0.9,
            label='Test Data')
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Random Forest Residual plot for HDB")
plt.legend(loc='upper left')
# Zero-residual reference line across the plotted price range.
plt.hlines(y=0,xmin=0,xmax=1300000,lw=2,color='black')
plt.xlim([0,1300000])
plt.show()
MSE Train: 127027355.059, Test: 895278715.106 R2_Score Train: 0.995, Test: 0.964
def test_RandomForestRegressor_num(*data):
    """Plot train/test R^2 of a random forest as n_estimators grows.

    `data` is (X_train, X_test, y_train, y_test).
    """
    tr_X, te_X, tr_y, te_y = data
    estimator_counts = np.arange(1, 100, step=2)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    train_curve = []
    test_curve = []
    # Fit one forest per estimator count and record both scores.
    for count in estimator_counts:
        forest_model = ensemble.RandomForestRegressor(n_estimators=count)
        forest_model.fit(tr_X, tr_y)
        train_curve.append(forest_model.score(tr_X, tr_y))
        test_curve.append(forest_model.score(te_X, te_y))
    ax.plot(estimator_counts, train_curve, label="Training Score")
    ax.plot(estimator_counts, test_curve, label="Testing Score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1, 1)
    plt.suptitle("RandomForestRegressor HDB")
    plt.show()

# Sweep n_estimators on the HDB split.
test_RandomForestRegressor_num(X_train,X_test,y_train,y_test)
# Feature importance via a random forest classifier (the continuous target is
# cast to int so it can be used as class labels).
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Scaled copies of the data (not consumed by the tree model below).
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train.astype('int'))
X_test_norm = mms.transform(X_test.astype('int'))  # FIX: cast like the train split
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train.astype('int'))
X_test_std = stdsc.transform(X_test.astype('int'))  # FIX: cast like the train split

# FIX: label importances with X's own column order. The original used
# dataset_features.columns[1:], which lists the same columns in a DIFFERENT
# order than X (flat_type/town_cat/storey_range are permuted), so the
# importance ranking was mislabelled.
feat_labels = X_train.columns

forest = RandomForestClassifier(n_estimators=25,
                                random_state=1)
forest.fit(X_train, y_train.astype('int'))
importances = forest.feature_importances_
print(importances)
[0.0050999 0.00920884 0.15994133 0.06662063 0.00401092 0.03840674 0.32587532 0.04762611 0.04767981 0.04862806 0.04718655 0.0471266 0.04834136 0.04799394 0.04769697 0.00741127 0.00114564]
# Rank features by importance, descending.
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 60,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
# Bar chart of the sorted importances.
plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        align='center')
plt.xticks(range(X_train.shape[1]),
           feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
#plt.savefig('images/04_09.png', dpi=300)
plt.show()
1) remaining_lease_yrs 0.325875 2) flat_type 0.159941 3) floor_area_sqm 0.066621 4) distance_mrt 0.048628 5) distance_city 0.048341 6) distance_npc 0.047994 7) distance_cc 0.047697 8) distance_primary_school 0.047680 9) distance_secondary_school 0.047626 10) distance_supermarket 0.047187 11) distance_hawker 0.047127 12) total_dwelling_units 0.038407 13) storey_range 0.009209 14) commercial 0.007411 15) town_cat 0.005100 16) Planning Area 0.004011 17) Mature_Estate 0.001146
# Keep only features whose importance exceeds the 0.1 threshold.
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
print('Number of features that meet this threshold criterion:',
      X_selected.shape[1])
# The surviving features are the top-ranked ones, so reuse the sorted indices.
for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
Number of features that meet this threshold criterion: 2 1) remaining_lease_yrs 0.325875 2) flat_type 0.159941
##Random Forest: tune max_features via 5-fold cross-validated RMSE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
max_features = [.1,.3,.5,.7,.9,.99]#trying a series of parameters
test_scores = []
for max_feat in tqdm(max_features):
    clf = RandomForestRegressor(n_estimators = 200,max_features = max_feat)
    # neg-MSE per fold -> RMSE per fold, then averaged over the 5 folds
    test_score = np.sqrt(-cross_val_score(clf,X_train,y_train,cv = 5,scoring = 'neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
100%|██████████| 6/6 [25:37<00:00, 256.21s/it]
#result: CV RMSE as a function of max_features
import matplotlib.pyplot as plt
plt.plot(max_features,test_scores)
plt.title('Max Features vs CV Error')
#The possible optimal parameter may be 0.7
Text(0.5, 1.0, 'Max Features vs CV Error')
from sklearn.ensemble import RandomForestRegressor
# Final HDB model with the tuned max_features value (0.7 from the CV sweep).
rfg = RandomForestRegressor(n_estimators = 200,max_features =0.7)
rfg.fit(X_train, y_train)
y_rfg = rfg.predict(X_test)
submission_df = pd.DataFrame(data= { 'y_test': y_test,'predicted_price': y_rfg})
print(rfg.get_params())
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 0.7, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
# Preview actual vs predicted prices from the tuned forest.
submission_df.head()
| y_test | predicted_price | |
|---|---|---|
| 32738 | 390000.0 | 370352.76 |
| 73042 | 491000.0 | 503145.00 |
| 82527 | 250000.0 | 268428.32 |
| 6719 | 300000.0 | 273718.52 |
| 52230 | 255000.0 | 265770.00 |
from numpy import mean

# Sanity check: compare averages and maxima of actual vs predicted prices.
average1, average2 = mean(y_test), mean(y_rfg)
max1, max2 = np.max(y_test), np.max(y_rfg)
print('Average y_test =', average1)
print('Average predicted_price =', average2)
print('Maximum y_test =', max1)
print('Maximum y_rfg =', max2)
Average y_test = 456943.7768200787 Average predicted_price = 456295.259636511 Maximum y_test = 1240000.0 Maximum y_rfg = 1214205.88
submission_df.to_csv('rfg_HDB.csv',index=False)
# --- Private (non-landed) housing dataset ---
ph = pd.read_csv('Private_Housing_Complete_dataset.csv')
/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (23) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
# Keep only the columns used in the private-housing analysis.
dataset_features = ph[['abs_price', 'Postal District','Type_no','Floor_no' ,'floor_area_sqm', 'Planning Area_no', 'remaining_lease_yrs',
'distance_secondary_school','distance_primary_school', 'distance_mrt', 'distance_supermarket', 'distance_hawker',
'distance_city', 'distance_npc', 'distance_cc','Mature_Estate']]
print(len(dataset_features))
# Dependent variable (y): absolute transaction price
resale_p = dataset_features['abs_price']
# All other independent variables
X = dataset_features[['Postal District','Type_no','Floor_no' ,'floor_area_sqm', 'Planning Area_no', 'remaining_lease_yrs',
'distance_secondary_school','distance_primary_school', 'distance_mrt', 'distance_supermarket', 'distance_hawker',
'distance_city', 'distance_npc', 'distance_cc','Mature_Estate']]
dataset_features.head()
49613
| abs_price | Postal District | Type_no | Floor_no | floor_area_sqm | Planning Area_no | remaining_lease_yrs | distance_secondary_school | distance_primary_school | distance_mrt | distance_supermarket | distance_hawker | distance_city | distance_npc | distance_cc | Mature_Estate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3290240.0 | 1 | 1 | 48 | 1356.0 | 5 | 89 | 1.634421 | 1.012462 | 0.374709 | 0.601335 | 0.376115 | 1.817501 | 0.260453 | 0.823495 | 1 |
| 1 | 2487680.0 | 1 | 1 | 43 | 1216.0 | 5 | 89 | 1.634421 | 1.012462 | 0.374709 | 0.601335 | 0.376115 | 1.817501 | 0.260453 | 0.823495 | 1 |
| 2 | 2130000.0 | 1 | 1 | 53 | 883.0 | 5 | 89 | 1.634421 | 1.012462 | 0.374709 | 0.601335 | 0.376115 | 1.817501 | 0.260453 | 0.823495 | 1 |
| 3 | 2990000.0 | 1 | 1 | 38 | 1356.0 | 5 | 89 | 1.634421 | 1.012462 | 0.374709 | 0.601335 | 0.376115 | 1.817501 | 0.260453 | 0.823495 | 1 |
| 4 | 3150000.0 | 1 | 1 | 38 | 1518.0 | 5 | 89 | 1.634421 | 1.012462 | 0.374709 | 0.601335 | 0.376115 | 1.817501 | 0.260453 | 0.823495 | 1 |
# Pairwise scatter plots of the private-housing features (KDE on the diagonal).
import seaborn as sns
sns.pairplot(dataset_features,diag_kind='kde')
/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:306: UserWarning: Dataset has 0 variance; skipping density estimate. warnings.warn(msg, UserWarning)
<seaborn.axisgrid.PairGrid at 0x7fd95efbf4c0>
# Benchmark the same three regressors on the private-housing data.
from sklearn.linear_model import LinearRegression
from sklearn import ensemble
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
models_name=['LinearRegression','GBRT','RandomForest']
models=[]
models.append(LinearRegression())
models.append(ensemble.GradientBoostingRegressor(n_estimators=100))
models.append(ensemble.RandomForestRegressor(n_estimators=40))  # fewer trees than the HDB run
np.random.seed(100)
# 70/30 split; X/resale_p were rebound to the private-housing data above.
X_train, X_test, y_train, y_test = train_test_split(X, resale_p, test_size=.3, random_state=0)
MSE_lst = []
for i in models:
    i.fit(X_train, y_train) # Train the model using the training sets
    y_pred = i.predict(X_test) # Make predictions using the testing set
    MSE = mean_squared_error(y_test, y_pred) # performance statistic
    MSE_lst.append(MSE)
# Ordinary least squares baseline, scored on the test split.
np.random.seed(100)
regre = LinearRegression()
regre.fit(X_train, y_train) # Train the model using the training sets
y_pred_ols = regre.predict(X_test) # Make predictions using the testing set
MSE_ols = mean_squared_error(y_test, y_pred_ols) # performance statistic
MSE_lst.append(MSE_ols)
score_regre = regre.score(X_test,y_test)
print('OLS_MSE =', MSE_ols)
print('OLS_R squared =', score_regre)
# Gradient-boosting benchmark on the private-housing split.
from sklearn.ensemble import GradientBoostingRegressor

# FIX: the original rebound the name `GradientBoostingRegressor` to the fitted
# instance, shadowing the class. Use a distinct variable for the instance.
gbr = GradientBoostingRegressor()
gbr.fit(X=X_train, y=y_train)
y_pred_GradientBoostingRegressor = gbr.predict(X=X_test)
MSE_GradientBoostingRegressor = mean_squared_error(y_test, y_pred_GradientBoostingRegressor)
MSE_lst.append(MSE_GradientBoostingRegressor)
score_GBR = gbr.score(X_test,y_test)  # R^2 on the test set
print('GBR_MSE =', MSE_GradientBoostingRegressor)
print('GBR_R squared =', score_GBR)
# Random-forest benchmark on the private-housing split.
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): rebinds `RandomForestRegressor` from the class to the fitted
# instance; the residual-plot cell below relies on this instance, so kept as-is.
RandomForestRegressor = RandomForestRegressor()
RandomForestRegressor.fit(X=X_train, y=y_train)
y_pred_RandomForestRegressor = RandomForestRegressor.predict(X=X_test)
MSE_RandomForestRegressor = mean_squared_error(y_test, y_pred_RandomForestRegressor)
MSE_lst.append(MSE_RandomForestRegressor)
score_RFR = RandomForestRegressor.score(X_test,y_test)
print('RFR_MSE =',MSE_RandomForestRegressor)
print('RFR_R squared =', score_RFR)
OLS_MSE = 1443949572569.5076 OLS_R squared = 0.2443561925130916 GBR_MSE = 149540294048.7973 GBR_R squared = 0.921742975437388 RFR_MSE = 47068756485.42806 RFR_R squared = 0.9753681049255541
#RandomForest has better accuracy
# Overlay actual vs predicted prices (first 100 test rows), one subplot per model.
import matplotlib.pyplot as plt
plt.figure(figsize=(8,18))
for i in range(len(models)):
    plt.subplot(311+i)
    ax=plt.gca()
    y_pred=models[i].predict(X_test)
    ax.plot(range(len(y_test[:100])), y_test[:100],'k--',label='Actual')
    ax.plot(range(len(y_pred[:100])), y_pred[:100], 'k',label='Predicted')
    plt.title('%s'%models_name[i])
# Residuals of the (untuned) random forest on the private-housing split.
RandomForestRegressor.fit(X_train,y_train)
y_train_pred=RandomForestRegressor.predict(X_train)
y_test_pred=RandomForestRegressor.predict(X_test)
#Evaluate model performance via MSE and R2_Score
from sklearn.metrics import mean_squared_error,r2_score
print("MSE Train: %.3f, Test: %.3f" % (mean_squared_error(y_train,y_train_pred),
                                       mean_squared_error(y_test,y_test_pred)))
print("R2_Score Train: %.3f, Test: %.3f" % (r2_score(y_train,y_train_pred),
                                            r2_score(y_test,y_test_pred)))
#Visualize the residuals of the prediction (residual = prediction - actual)
plt.scatter(y_train_pred,y_train_pred-y_train,
            c='steelblue',
            edgecolor='white',
            marker='o',
            s=35,
            alpha=0.9,
            label='Training Data')
plt.scatter(y_test_pred,y_test_pred-y_test,
            c='limegreen',
            edgecolor='white',
            marker='s',
            s=30,
            alpha=0.9,
            label='Test Data')
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Random Forest Residual plot for Private")
plt.legend(loc='upper left')
# Zero-residual reference line across the plotted price range.
plt.hlines(y=0,xmin=0,xmax=22000000,lw=2,color='black')
plt.xlim([0,22000000])
plt.show()
MSE Train: 29812585919.726, Test: 48036221156.803 R2_Score Train: 0.987, Test: 0.975
# Re-defines the sweep helper from the HDB section with a different plot title.
def test_RandomForestRegressor_num(*data):
    """Plot train/test R^2 of a random forest as n_estimators grows.

    `data` is (X_train, X_test, y_train, y_test).
    """
    X_train,X_test,y_train,y_test=data
    nums=np.arange(1,100,step=2)
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    testing_scores=[]
    training_scores=[]
    # One forest per estimator count; record both scores.
    for num in nums:
        regr=ensemble.RandomForestRegressor(n_estimators=num)
        regr.fit(X_train,y_train)
        training_scores.append(regr.score(X_train,y_train))
        testing_scores.append(regr.score(X_test,y_test))
    ax.plot(nums,training_scores,label="Training Score")
    ax.plot(nums,testing_scores,label="Testing Score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1,1)
    plt.suptitle("RandomForestRegressor Private")
    plt.show()
# test_RandomForestRegressor_num
test_RandomForestRegressor_num(X_train,X_test,y_train,y_test)
# Feature importance via a random forest classifier (target cast to int so it
# can be used as class labels).
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Scaled copies of the data (not consumed by the tree model below).
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train.astype('int'))
X_test_norm = mms.transform(X_test.astype('int'))  # FIX: cast like the train split
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train.astype('int'))
X_test_std = stdsc.transform(X_test.astype('int'))  # FIX: cast like the train split

# FIX (consistency/robustness): take labels from X itself rather than
# dataset_features.columns[1:]; for this dataset the orders coincide, but using
# X's own columns removes the mislabelling hazard seen in the HDB section.
feat_labels = X_train.columns

forest = RandomForestClassifier(n_estimators=60,
                                random_state=1)
forest.fit(X_train, y_train.astype('int'))
importances = forest.feature_importances_
print(importances)
[0.01329411 0.00538083 0.25328321 0.53405793 0.00102718 0.01261719 0.02238435 0.02204848 0.022027 0.02245727 0.0221031 0.023237 0.02328034 0.02280201 0. ]
# Rank features by importance, descending.
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 60,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
# Bar chart of the sorted importances.
plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        align='center')
plt.xticks(range(X_train.shape[1]),
           feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
#plt.savefig('images/04_09.png', dpi=300)
plt.show()
1) floor_area_sqm 0.534058 2) Floor_no 0.253283 3) distance_npc 0.023280 4) distance_city 0.023237 5) distance_cc 0.022802 6) distance_supermarket 0.022457 7) distance_secondary_school 0.022384 8) distance_hawker 0.022103 9) distance_primary_school 0.022048 10) distance_mrt 0.022027 11) Postal District 0.013294 12) remaining_lease_yrs 0.012617 13) Type_no 0.005381 14) Planning Area_no 0.001027 15) Mature_Estate 0.000000
# Keep only features whose importance exceeds the 0.1 threshold.
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
print('Number of features that meet this threshold criterion:',
      X_selected.shape[1])
# The surviving features are the top-ranked ones, so reuse the sorted indices.
for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
Number of features that meet this threshold criterion: 2 1) floor_area_sqm 0.534058 2) Floor_no 0.253283
##Random Forest: tune max_features via 5-fold cross-validated RMSE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
max_features = [.001,.05,.1,.3,.5,.7,.9,.99]#trying a series of parameters
test_scores = []
for max_feat in tqdm(max_features):  # tqdm imported in the HDB section
    clf = RandomForestRegressor(n_estimators = 200,max_features = max_feat)
    # neg-MSE per fold -> RMSE per fold, then averaged over the 5 folds
    test_score = np.sqrt(-cross_val_score(clf,X_train,y_train,cv = 5,scoring = 'neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
#result: CV RMSE as a function of max_features
import matplotlib.pyplot as plt
plt.plot(max_features,test_scores)
plt.title('Max Features vs CV Error')
100%|██████████| 8/8 [04:17<00:00, 32.23s/it]
Text(0.5, 1.0, 'Max Features vs CV Error')
##The possible optimal parameter may be 0.1
from sklearn.ensemble import RandomForestRegressor
# Final private-housing model with the tuned max_features value.
rfg = RandomForestRegressor(n_estimators = 200,max_features =0.1)
rfg.fit(X_train, y_train)
y_rfg = rfg.predict(X_test)
submission_df = pd.DataFrame(data= { 'y_test': y_test,'predicted_price': y_rfg})
print(rfg.get_params())
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 0.1, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
# Preview actual vs predicted prices from the tuned forest.
submission_df.head()
| y_test | predicted_price | |
|---|---|---|
| 39910 | 1513000.0 | 1.522919e+06 |
| 26685 | 786000.0 | 8.020888e+05 |
| 25718 | 942000.0 | 1.007335e+06 |
| 37916 | 2000000.0 | 2.218186e+06 |
| 24821 | 2680000.0 | 2.484494e+06 |
# Sanity check: compare averages and maxima of actual vs predicted prices.
from numpy import mean
average1 = mean(y_test)
average2 = mean(y_rfg)
max1 = np.max(y_test)
max2 = np.max(y_rfg)
print('Average y_test =', average1)
print('Average predicted_price =', average2)
print('Maximum y_test =', max1)
print('Maximum y_rfg =', max2)
Average y_test = 1672409.735421423 Average predicted_price = 1670308.988822118 Maximum y_test = 39000000.0 Maximum y_rfg = 37124959.78906061
submission_df.to_csv('rfg_private.csv',index=False)
# --- Landed housing dataset ---
lh = pd.read_csv('Landed_Complete_dataset.csv')
dataset_features = lh[['abs_price', 'Postal District','Type_no' ,'floor_area_sqm', 'Planning Area_no', 'remaining_lease_yrs',
'distance_secondary_school','distance_primary_school', 'distance_mrt', 'distance_supermarket', 'distance_hawker',
'distance_city', 'distance_npc', 'distance_cc','Mature_Estate']]
print(len(dataset_features))
# Dependent variable (y): absolute transaction price
resale_p = dataset_features['abs_price']
# All other independent variables
X = dataset_features[['Postal District','Type_no' ,'floor_area_sqm', 'Planning Area_no', 'remaining_lease_yrs',
'distance_secondary_school','distance_primary_school', 'distance_mrt', 'distance_supermarket', 'distance_hawker',
'distance_city', 'distance_npc', 'distance_cc','Mature_Estate']]
dataset_features.head()
12438
| abs_price | Postal District | Type_no | floor_area_sqm | Planning Area_no | remaining_lease_yrs | distance_secondary_school | distance_primary_school | distance_mrt | distance_supermarket | distance_hawker | distance_city | distance_npc | distance_cc | Mature_Estate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6000000.0 | 2 | 3 | 3193.0 | 5 | 999999 | 0.960972 | 0.684728 | 0.463179 | 0.444501 | 0.104015 | 1.732091 | 0.525199 | 0.145201 | 1 |
| 1 | 6000000.0 | 2 | 3 | 3193.0 | 5 | 999999 | 0.960972 | 0.684728 | 0.463179 | 0.444501 | 0.104015 | 1.732091 | 0.525199 | 0.145201 | 1 |
| 2 | 6380000.0 | 3 | 1 | 5199.0 | 5 | 999999 | 0.901405 | 0.776450 | 0.509438 | 0.203843 | 0.451131 | 5.202830 | 0.391794 | 0.984172 | 1 |
| 3 | 6380000.0 | 3 | 1 | 5199.0 | 5 | 999999 | 0.901405 | 0.776450 | 0.509438 | 0.203843 | 0.451131 | 5.202830 | 0.391794 | 0.984172 | 1 |
| 4 | 5550000.0 | 3 | 1 | 4674.0 | 5 | 999999 | 0.733715 | 0.579922 | 0.360550 | 0.412032 | 0.557248 | 5.071424 | 0.591292 | 0.892015 | 1 |
# Pairwise scatter plots of the landed-housing features (KDE on the diagonal).
import seaborn as sns
sns.pairplot(dataset_features,diag_kind='kde')
/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:306: UserWarning: Dataset has 0 variance; skipping density estimate. warnings.warn(msg, UserWarning)
<seaborn.axisgrid.PairGrid at 0x7ff3a934d640>
# Benchmark the same three regressors on the landed-housing data.
from sklearn.linear_model import LinearRegression
from sklearn import ensemble
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
models_name=['LinearRegression','GBRT','RandomForest']
models=[]
models.append(LinearRegression())
models.append(ensemble.GradientBoostingRegressor(n_estimators=100))
models.append(ensemble.RandomForestRegressor(n_estimators=200))
np.random.seed(100)
# 70/30 split; X/resale_p were rebound to the landed-housing data above.
X_train, X_test, y_train, y_test = train_test_split(X, resale_p, test_size=.3, random_state=0)
MSE_lst = []
for i in models:
    i.fit(X_train, y_train) # Train the model using the training sets
    y_pred = i.predict(X_test) # Make predictions using the testing set
    MSE = mean_squared_error(y_test, y_pred) # performance statistic
    MSE_lst.append(MSE)
# Ordinary least squares baseline, scored on the test split.
np.random.seed(100)
regre = LinearRegression()
regre.fit(X_train, y_train) # Train the model using the training sets
y_pred_ols = regre.predict(X_test) # Make predictions using the testing set
MSE_ols = mean_squared_error(y_test, y_pred_ols) # performance statistic
MSE_lst.append(MSE_ols)
score_regre = regre.score(X_test,y_test)
print('OLS_MSE =', MSE_ols)
print('OLS_R squared =', score_regre)
# Gradient-boosting benchmark on the landed-housing split.
from sklearn.ensemble import GradientBoostingRegressor

# FIX: the original rebound the name `GradientBoostingRegressor` to the fitted
# instance, shadowing the class. Use a distinct variable for the instance.
gbr = GradientBoostingRegressor()
gbr.fit(X=X_train, y=y_train)
y_pred_GradientBoostingRegressor = gbr.predict(X=X_test)
MSE_GradientBoostingRegressor = mean_squared_error(y_test, y_pred_GradientBoostingRegressor)
MSE_lst.append(MSE_GradientBoostingRegressor)
score_GBR = gbr.score(X_test,y_test)  # R^2 on the test set
print('GBR_MSE =', MSE_GradientBoostingRegressor)
print('GBR_R squared =', score_GBR)
# Random-forest benchmark on the landed-housing split.
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): rebinds `RandomForestRegressor` from the class to the fitted
# instance; the residual-plot cell below relies on this instance, so kept as-is.
RandomForestRegressor = RandomForestRegressor()
RandomForestRegressor.fit(X=X_train, y=y_train)
y_pred_RandomForestRegressor = RandomForestRegressor.predict(X=X_test)
MSE_RandomForestRegressor = mean_squared_error(y_test, y_pred_RandomForestRegressor)
MSE_lst.append(MSE_RandomForestRegressor)
score_RFR = RandomForestRegressor.score(X_test,y_test)
print('RFR_MSE =',MSE_RandomForestRegressor)
print('RFR_R squared =', score_RFR)
OLS_MSE = 4688531784791.673 OLS_R squared = 0.8473243490863328 GBR_MSE = 3570250938931.6196 GBR_R squared = 0.8837396415238898 RFR_MSE = 5911945954969.158 RFR_R squared = 0.8074855331536492
# Overlay actual vs predicted prices (first 100 test rows), one subplot per model.
import matplotlib.pyplot as plt
plt.figure(figsize=(8,18))
for i in range(len(models)):
    plt.subplot(311+i)
    ax=plt.gca()
    y_pred=models[i].predict(X_test)
    ax.plot(range(len(y_test[:100])), y_test[:100],'k--',label='Actual')
    ax.plot(range(len(y_pred[:100])), y_pred[:100], 'k',label='Predicted')
    plt.title('%s'%models_name[i])
# Residuals of the (untuned) random forest on the landed-housing split.
RandomForestRegressor.fit(X_train,y_train)
y_train_pred=RandomForestRegressor.predict(X_train)
y_test_pred=RandomForestRegressor.predict(X_test)
#Section 3: Evaluate model performance via MSE and R2_Score
from sklearn.metrics import mean_squared_error,r2_score
print("MSE Train: %.3f, Test: %.3f" % (mean_squared_error(y_train,y_train_pred),
                                       mean_squared_error(y_test,y_test_pred)))
print("R2_Score Train: %.3f, Test: %.3f" % (r2_score(y_train,y_train_pred),
                                            r2_score(y_test,y_test_pred)))
#Visualize the residuals of the prediction (residual = prediction - actual)
plt.scatter(y_train_pred,y_train_pred-y_train,
            c='steelblue',
            edgecolor='white',
            marker='o',
            s=35,
            alpha=0.9,
            label='Training Data')
plt.scatter(y_test_pred,y_test_pred-y_test,
            c='limegreen',
            edgecolor='white',
            marker='s',
            s=30,
            alpha=0.9,
            label='Test Data')
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Random Forest Residual plot for Landed")
plt.legend(loc='upper left')
# Zero-residual reference line across the plotted price range.
plt.hlines(y=0,xmin=0,xmax=41000000,lw=2,color='black')
plt.xlim([0,41000000])
plt.show()
MSE Train: 293731609518.639, Test: 6225082157746.746 R2_Score Train: 0.991, Test: 0.797
def test_RandomForestRegressor_num(*data):
    """Plot train/test R^2 of a random forest as n_estimators grows.

    Sweeps n_estimators over 1..99 (step 2), fitting a fresh forest at each
    size, and plots the training and testing scores on one axis.

    Parameters
    ----------
    *data : tuple
        Exactly (X_train, X_test, y_train, y_test).
    """
    # Import the class locally: the module-level name
    # `RandomForestRegressor` was rebound to a fitted instance earlier, and
    # the original body referenced `ensemble.RandomForestRegressor` even
    # though no `ensemble` module is imported in view. A function-local
    # import sidesteps both problems.
    from sklearn.ensemble import RandomForestRegressor as _RFRegressor

    X_train, X_test, y_train, y_test = data
    nums = np.arange(1, 100, step=2)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    testing_scores = []
    training_scores = []
    for num in nums:
        regr = _RFRegressor(n_estimators=num)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))
    ax.plot(nums, training_scores, label="Training Score")
    ax.plot(nums, testing_scores, label="Testing Score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(-1, 1)
    plt.suptitle("RandomForestRegressor Landed")
    plt.show()

# test_RandomForestRegressor_num
test_RandomForestRegressor_num(X_train,X_test,y_train,y_test)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
# Scale features two ways (min-max and standardization).
# NOTE(review): X_train_norm/X_test_norm and X_train_std/X_test_std are
# computed but never used below — the forest is fitted on the raw X_train.
# NOTE(review): both scalers are fitted on X_train.astype('int') but applied
# to the un-cast X_test, so the train/test transforms are inconsistent —
# confirm whether the int cast is intentional.
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train.astype('int'))
X_test_norm = mms.transform(X_test)
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train.astype('int'))
X_test_std = stdsc.transform(X_test)
# Feature names, skipping column 0 of dataset_features (the target,
# resale_price).
feat_labels = dataset_features.columns[1:]
# NOTE(review): a *classifier* is fitted on the continuous price target cast
# to int, which treats every distinct price as a class; a regressor's
# feature_importances_ would be the conventional choice here.
forest = RandomForestClassifier(n_estimators=100,
random_state=1)
forest.fit(X_train, y_train.astype('int'))
# Impurity-based importances, one value per column of X_train.
importances = forest.feature_importances_
print(importances)
[0.02614677 0.02416202 0.4552524 0.0007066 0.02978284 0.05717202 0.05753524 0.05813957 0.05875736 0.05842082 0.05798599 0.058104 0.05783438 0. ]
# Rank the features from most to least important, print the ranking, and
# visualize it as a bar chart.
indices = np.argsort(importances)[::-1]
for rank, idx in enumerate(indices[:X_train.shape[1]], start=1):
    print("%2d) %-*s %f" % (rank, 60,
                            feat_labels[idx],
                            importances[idx]))

plt.title('Feature Importance')
positions = range(X_train.shape[1])
plt.bar(positions,
        importances[indices],
        align='center')
plt.xticks(positions,
           feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
#plt.savefig('images/04_09.png', dpi=300)
plt.show()
1) floor_area_sqm 0.455252 2) distance_supermarket 0.058757 3) distance_hawker 0.058421 4) distance_mrt 0.058140 5) distance_npc 0.058104 6) distance_city 0.057986 7) distance_cc 0.057834 8) distance_primary_school 0.057535 9) distance_secondary_school 0.057172 10) remaining_lease_yrs 0.029783 11) Postal District 0.026147 12) Type_no 0.024162 13) Planning Area_no 0.000707 14) Mature_Estate 0.000000
# Keep only features whose importance exceeds 0.1; `prefit=True` reuses the
# forest fitted above without re-fitting it.
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
print('Number of features that meet this threshold criterion:',
X_selected.shape[1])
# Print the surviving features. This relies on `indices` being sorted by
# descending importance, so the first X_selected.shape[1] entries are
# exactly the features that passed the threshold.
for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
Number of features that meet this threshold criterion: 1 1) floor_area_sqm 0.455252
##Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Sweep candidate max_features ratios and record the mean 5-fold CV RMSE
# for each one.
max_features = [.001,.1,.3,.5,.7,.9,.99]#trying a series of parameters
test_scores = []
for max_feat in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
    # cross_val_score returns *negated* MSE; negate and take the square
    # root to get one RMSE per fold, then average the folds.
    neg_mse = cross_val_score(clf, X_train, y_train,
                              cv=5, scoring='neg_mean_squared_error')
    fold_rmse = np.sqrt(-neg_mse)
    test_scores.append(np.mean(fold_rmse))
#result
import matplotlib.pyplot as plt
plt.plot(max_features, test_scores)
plt.title('Max Features vs CV Error')
Text(0.5, 1.0, 'Max Features vs CV Error')
#The optimal max features may be 0.3
from sklearn.ensemble import RandomForestRegressor

# Train the final model with the tuned max_features and build a
# side-by-side table of actual vs. predicted test prices.
rfg = RandomForestRegressor(n_estimators=200, max_features=0.3)
rfg.fit(X_train, y_train)
y_rfg = rfg.predict(X_test)
comparison = {'y_test': y_test, 'predicted_price': y_rfg}
submission_df = pd.DataFrame(data=comparison)
print(rfg.get_params())
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 0.3, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
submission_df.head()  # preview the first rows of actual vs. predicted prices
| y_test | predicted_price | |
|---|---|---|
| 8076 | 2500000.0 | 2.526972e+06 |
| 11068 | 1800000.0 | 1.855083e+06 |
| 10386 | 2550000.0 | 2.623833e+06 |
| 2476 | 10580000.0 | 1.051162e+07 |
| 11425 | 3198000.0 | 2.995820e+06 |
from numpy import mean

# Compare summary statistics (mean and maximum) of the actual vs. the
# predicted test-set prices.
average1 = mean(y_test)
average2 = mean(y_rfg)
max1 = np.max(y_test)
max2 = np.max(y_rfg)
for label, value in [('Average y_test =', average1),
                     ('Average predicted_price =', average2),
                     ('Maximum y_test =', max1),
                     ('Maximum y_rfg =', max2)]:
    print(label, value)
Average y_test = 4831878.553330734 Average predicted_price = 4868299.330290501 Maximum y_test = 93900000.0 Maximum y_rfg = 74061005.80348484
# Persist the actual-vs-predicted table for the landed-property model.
submission_df.to_csv('rfg_landed.csv',index=False)